This document describes an estimate of the positive predictive value (PPV) of variant discovery for all variants in the gene KCNH2 on long QT syndrome type 2 (LQT2). Additional details on the methods used are published Kroncke et al. 2020 PLOS Genetics and at the following website: (https://oates.app.vumc.org/vancart/SCN5A/SCN5A-report.html). We use observed and estimated probability of LQT2 diagnosis for all known KCNH2 variants as a way to assess the per variant PPV for variant discovery. Our objective is to develope a prior estimate of the per variant PPV on LQT2 which incorporates structure, function, and in silico predictors. We use these in silico and in vitro data to generate a Bayesian prior estimate of the per variant PPV since these data can be generated in a lab setting, unlike heterozygotes/carriers of KCNH2 variants which may or may not exist. The final posterior estimate combines this derived prior and clinically phenotyped heterozygotes/carriers.
# mut_type has includes type and isoform for all variants in the literature and in the cohort.
mut_type <- mut_type[mut_type$mut_type == "missense",]
# Cohort carrier/heterozygote counts and variant IDs.
# Here we select only the missense variants (in-frame indels are also included as "missense")
cohort.data <- cohort.data[cohort.data$mut_type == "missense" & cohort.data$total_carriers > 0,]
# Literature dataset where potentially overlapping carriers/heterozygotes are removed
d <- lit.nonoverlap.data[lit.nonoverlap.data$mut_type == "missense",]
# Here, heterozygotes/carriers from gnomAD are removed to test their influence on performance
# Remove comments in next two lines and complete subsequent evaluation to assess influence of gnomAD on
# calculations.
#d$total_carriers <- d$total_carriers - d$gnomAD # test influence of gnomAD
#d$unaff <- d$total_carriers - d$lqt2 # test influence of gnomAD
# set initial weighting and penetrance
d$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d[d$total_carriers < 1,"weight"] <- 0.000 # This is changed to "< 2" when evaluating ROC-AUC of n=1 variants from the literatureUse observed LQT2 diagnosis probability to calculate “LQTS probability density” as described in previous publication. Plot probability density versus residue
# Mean squared error
mse <- function(sm) {
mean((sm$residuals)^2*(sm$weights))
}
# Weighted mean to determine LQT2 penetrance empirical prior
newdata = data.frame(wt=1)
model <- lm(penetrance_lqt2 ~ 1, data=d, weights = d$weight)
summary(model)##
## Call:
## lm(formula = penetrance_lqt2 ~ 1, data = d, weights = d$weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.2329 -0.1653 -0.0232 0.0763 0.7561
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2332 0.0135 17.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2323 on 789 degrees of freedom
## (467 observations deleted due to missingness)
p<-predict(model, newdata)
dev<- mse(model)#p*(1-p)
# Derive alpha and beta from weighted mean and MSE (estimated variance)
estBetaParams <- function(mu, var) {
alpha <- ((1 - mu) / var - 1 / mu) * mu ^ 2
beta <- alpha * (1 / mu - 1)
return(params = list(alpha = alpha, beta = beta))
}
# Estimated shape parameters for LQT2 empirical prior
alpha0 = estBetaParams(p,dev)$alpha
beta0 = estBetaParams(p,dev)$beta
print(paste("alpha0 = ", alpha0, " beta0 = ", beta0))## [1] "alpha0 = 0.54041284982238 beta0 = 1.77720039413882"
# Bayesian LQT2 penetrance estimates from empirical priors
# and observed affected/unaffected counts:
d$lqt2_penetranceBayesian_initial <- (alpha0 + d[,"lqt2"])/((alpha0 + beta0 + d[,"total_carriers"]))
d$lqt2_penetranceBayesian<-d$lqt2_penetranceBayesian_initial
# Plot literature observed LQT2 penetrance versus residue number
m<- d %>%
select(resnum, pmean = penetrance_lqt2) %>%
mutate(p_mean_smooth = rollmean(pmean, k=20, fill = NA))
fit <- loess(d[,"penetrance_lqt2"]~as.numeric(d[,"resnum"]), span = 0.15)
plot(d$resnum, d$penetrance_lqt2, xlab ="Residue", ylab = "LQT2 Penetrance Estimate")
xrange <- seq(min(fit$x), max(fit$x), length.out = 100)
ps <- predict(fit, xrange, se=T)
lines(xrange, ps$fit*1, lwd=5)
lines(xrange, (ps$fit+1.96*ps$se.fit)*1, lty=2, lwd=4)
lines(xrange, (ps$fit-1.96*ps$se.fit)*1, lty=2, lwd=4)With the updated empirical priors applied to carrier counts, calculate “LQTS probability density” as described in previous publication.
# NOTE: adjust 3rd argument given to funcdist, "d,", to d[d$total_carriers>1,] when
# evaluating ROC of total_carriers == 1
h2.covariates[, "lqt2_dist"]<-NA
h2.covariates[, "lqt2_dist_weight"]<-NA
mut_type <- mut_type[mut_type$mut_type == "missense" & mut_type$isoform == "A",]
for(rec in 1:nrow(mut_type)){
if (!is.na(mut_type[rec, "resnum"])) {
l<-length(h2.covariates[h2.covariates$var == mut_type[rec, "var"],"var"])
for (m in 1:l){
h2.covariates[h2.covariates$var == mut_type[rec, "var"],
c("lqt2_dist", "lqt2_dist_weight", "resnum")][m,] <- c(funcdist(mut_type[rec, "resnum"], mut_type[rec, "var"], d, h2dist, "penetrance_lqt2", "sigmoid", 7), mut_type[rec, "resnum"])
}
}
}
# Plot lqt2_dist versus residue number
h2.covariates$resnum<-as.integer(h2.covariates$resnum)
h2.covariates <- h2.covariates[order(h2.covariates$resnum),]
h2.covariates <- h2.covariates[!is.na(h2.covariates$resnum),]
m<- h2.covariates %>%
select(resnum, pmean = lqt2_dist) %>%
mutate(p_mean_smooth = rollmean(pmean, k=20, fill = NA))
fit <- loess(h2.covariates[,"lqt2_dist"]~as.numeric(h2.covariates[,"resnum"]), span = 0.15)
plot(h2.covariates$resnum, h2.covariates$lqt2_dist, xlab ="Residue", ylab = "LQT2 Penetrance Density", xlim=c(0,1160))
xrange <- seq(min(fit$x), max(fit$x), length.out = 100)
ps <- predict(fit, xrange, se=T)
lines(xrange, ps$fit*1, lwd=5)
lines(xrange, (ps$fit+1.96*ps$se.fit)*1, lty=2, lwd=4)
lines(xrange, (ps$fit-1.96*ps$se.fit)*1, lty=2, lwd=4)Evaluate weighted Spearman correlations coefficients between observed LQT2 diagnosis probability in the literature and various potential predictors
# Merge "d" with full variant list and set carrier counts to 0.
# This is done for convenience so we can estimate LQT2 diagnosis probability for all variants including
# those witheld during model construction. This will make validation easier.
d <- merge(d, h2.covariates, all = TRUE)
d <- unique(d)
calcPval=function(xName,yName,weightName,nPerms,new.mat2){
# Pulls out variables
x=new.mat2[,xName]
y=new.mat2[,yName]
w=new.mat2[,weightName]
x2=x[!is.na(x)]
y2=y[!is.na(x)]
w2=w[!is.na(x)]
# Calculate the real correlation
realCorr=weightedCorr(x2,y2,method='spearman',weights=w2)
# Do permutations, calculate fake correlations
permutedCorrList=c()
for(permNum in 1:nPerms){
permutedX=sample(x2,length(x2),replace=FALSE)
wCorrSim=weightedCorr(permutedX,y2,method='spearman',weights=w2)
permutedCorrList=c(permutedCorrList,wCorrSim)
}
permutedCorrList2=abs(permutedCorrList)
realCorr2=abs(realCorr)
# Calculate pvalue
summ=sum(realCorr2<permutedCorrList2)
pValue=summ/nPerms
return(list(realCorr,pValue,length(x2)))
}
calcAllPvals=function(yList,xList,nPerms,weightName,new.mat2){
i=0
resultTable=data.frame()
for(yName in yList){
for(xName in xList){
i=i+1
result=calcPval(xName,yName,weightName,nPerms,new.mat2)
resultTable[i,'x']=xName
resultTable[i,'y']=yName
resultTable[i,'nPerms']=nPerms
resultTable[i,'weightedCorr']=result[[1]]
resultTable[i,'pValue']=result[[2]]
resultTable[i,'n']=result[[3]]
#print(resultTable[i,'pValue'])
}
}
print(resultTable)
return(resultTable)
}
yList=c('penetrance_lqt2')
xList=c('hm_ssPeak','hm_tailPeak','hm_vhalfact','hm_vhalfinact','hm_recovfrominact', 'hm_taudeact_fast',
'ht_ssPeak','ht_tailPeak','ht_vhalfact','ht_vhalfinact','ht_recovfrominact', 'ht_taudeact_fast',
'pph2_prob', 'provean_score', "blast_pssm",
'pamscore', 'aasimilaritymat', "lqt2_dist", "revel_score")
tmp<-d[!is.na(d$penetrance_lqt2) & !is.na(d$revel_score),]
resultTable<-calcAllPvals(yList, xList, 1000, 'weight', tmp)## x y nPerms weightedCorr pValue n
## 1 hm_ssPeak penetrance_lqt2 1000 0.319565106 0.264 20
## 2 hm_tailPeak penetrance_lqt2 1000 -0.564758635 0.000 100
## 3 hm_vhalfact penetrance_lqt2 1000 0.147612664 0.293 68
## 4 hm_vhalfinact penetrance_lqt2 1000 0.669666858 0.001 29
## 5 hm_recovfrominact penetrance_lqt2 1000 -0.734693865 0.007 12
## 6 hm_taudeact_fast penetrance_lqt2 1000 -0.375296727 0.025 42
## 7 ht_ssPeak penetrance_lqt2 1000 -0.464626144 0.025 34
## 8 ht_tailPeak penetrance_lqt2 1000 -0.616133062 0.000 83
## 9 ht_vhalfact penetrance_lqt2 1000 -0.061932733 0.710 41
## 10 ht_vhalfinact penetrance_lqt2 1000 -0.173738913 0.475 26
## 11 ht_recovfrominact penetrance_lqt2 1000 -0.407537689 0.407 6
## 12 ht_taudeact_fast penetrance_lqt2 1000 0.009703229 0.975 14
## 13 pph2_prob penetrance_lqt2 1000 0.392919452 0.000 741
## 14 provean_score penetrance_lqt2 1000 -0.585053878 0.000 741
## 15 blast_pssm penetrance_lqt2 1000 -0.250452543 0.000 741
## 16 pamscore penetrance_lqt2 1000 -0.116758403 0.033 750
## 17 aasimilaritymat penetrance_lqt2 1000 0.018401459 0.719 750
## 18 lqt2_dist penetrance_lqt2 1000 0.583417065 0.000 748
## 19 revel_score penetrance_lqt2 1000 0.664409462 0.000 750
tmp<-d[!is.na(d$ht_tailPeak) & !is.na(d$penetrance_lqt2) & !is.na(d$revel_score),]
resultTable<-calcAllPvals(yList, xList, 1000, 'weight', tmp)## x y nPerms weightedCorr pValue n
## 1 hm_ssPeak penetrance_lqt2 1000 0.690504615 0.097 10
## 2 hm_tailPeak penetrance_lqt2 1000 -0.428867090 0.004 66
## 3 hm_vhalfact penetrance_lqt2 1000 -0.093938286 0.659 35
## 4 hm_vhalfinact penetrance_lqt2 1000 0.629956576 0.049 15
## 5 hm_recovfrominact penetrance_lqt2 1000 -0.772690799 0.294 4
## 6 hm_taudeact_fast penetrance_lqt2 1000 -0.228864954 0.368 17
## 7 ht_ssPeak penetrance_lqt2 1000 -0.464626144 0.022 34
## 8 ht_tailPeak penetrance_lqt2 1000 -0.616133062 0.000 83
## 9 ht_vhalfact penetrance_lqt2 1000 -0.061096836 0.748 40
## 10 ht_vhalfinact penetrance_lqt2 1000 -0.173348992 0.438 25
## 11 ht_recovfrominact penetrance_lqt2 1000 -0.407537689 0.424 6
## 12 ht_taudeact_fast penetrance_lqt2 1000 0.009703229 0.980 14
## 13 pph2_prob penetrance_lqt2 1000 0.332107233 0.008 81
## 14 provean_score penetrance_lqt2 1000 -0.455185289 0.000 81
## 15 blast_pssm penetrance_lqt2 1000 0.055910704 0.653 81
## 16 pamscore penetrance_lqt2 1000 0.058560727 0.655 83
## 17 aasimilaritymat penetrance_lqt2 1000 0.063049344 0.610 83
## 18 lqt2_dist penetrance_lqt2 1000 0.755839461 0.000 83
## 19 revel_score penetrance_lqt2 1000 0.612183017 0.000 83
Use an EM algorithm to estimate LQT2 PPV for each variant (or LQT2 diagnosis probability)
# Literature dataset where potentially overlapping carriers/heterozygotes are removed
d <- lit.nonoverlap.data[lit.nonoverlap.data$mut_type == "missense",]
# set initial weighting and penetrance
d$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d[d$total_carriers < 2,"weight"] <- 0.000 # This is changed to "< 2" here to evaluate ROC-AUC of n=1 variants from the literatureUse observed LQT2 diagnosis probability to calculate “LQTS probability density” as described in previous publication. Plot LQTS probability density versus residue
# Weighted mean to determine LQT2 penetrance empirical prior
newdata = data.frame(wt=1)
model <- lm(penetrance_lqt2 ~ 1, data=d, weights = d$weight)
summary(model)##
## Call:
## lm(formula = penetrance_lqt2 ~ 1, data = d, weights = d$weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.2323 -0.1648 0.0000 0.0000 0.7567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.23253 0.01893 12.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3236 on 399 degrees of freedom
## (467 observations deleted due to missingness)
p<-predict(model, newdata)
dev<- mse(model) #p*(1-p)
# Estimated shape parameters for LQT2 empirical prior
alpha0 = estBetaParams(p,dev)$alpha
beta0 = estBetaParams(p,dev)$beta
print(paste("alpha0 = ", alpha0, " beta0 = ", beta0))## [1] "alpha0 = 0.552007535426229 beta0 = 1.82190517994969"
# Bayesian LQT2 penetrance estimates from empirical priors
# and observed affected/unaffected counts:
d$lqt2_penetranceBayesian_initial <- (alpha0 + d[,"lqt2"])/((alpha0 + beta0 + d[,"total_carriers"]))
d$lqt2_penetranceBayesian<-d$lqt2_penetranceBayesian_initialWith the updated empirical priors applied to carrier counts, calculate “LQTS probability density” as described in previous publication.
h2.covariates[, "lqt2_dist"]<-NA
h2.covariates[, "lqt2_dist_weight"]<-NA
mut_type <- mut_type[mut_type$mut_type == "missense" & mut_type$isoform == "A",]
for(rec in 1:nrow(mut_type)){
if (!is.na(mut_type[rec, "resnum"])) {
h2.covariates[h2.covariates$var == mut_type[rec, "var"],
c("lqt2_dist", "lqt2_dist_weight", "resnum")] <- c(funcdist(mut_type[rec, "resnum"], mut_type[rec, "var"], d[d$total_carriers>1,], h2dist, "penetrance_lqt2", "sigmoid", 7), mut_type[rec, "resnum"])
}
}
# Plot lqt2_dist versus residue number
h2.covariates$resnum<-as.integer(h2.covariates$resnum)
h2.covariates <- h2.covariates[order(h2.covariates$resnum),]
h2.covariates <- h2.covariates[!is.na(h2.covariates$resnum),]
# Merge "d" with full variant list and set carrier counts to 0.
# This is done for convenience so we can estimate LQT2 penetrance for all variants including
# those witheld during model construction. This will make validation easier.
d <- merge(d, h2.covariates, all = TRUE)
d <- unique(d) Use an EM algorithm to estimate LQT2 PPV for each variant (or LQT2 diagnosis probability)
# Literature dataset where potentially overlapping carriers/heterozygotes are removed
d <- lit.cohort.data[lit.cohort.data$mut_type == "missense" & lit.cohort.data$isoform=="A",]
# add all possible variants
allvariants<-data.frame(unlist(h2.covariates[!is.na(h2.covariates$cardiacboost),"var"]), stringsAsFactors = F)
names(allvariants)<-"var"
allvariants$isoform<-unlist(h2.covariates[!is.na(h2.covariates$cardiacboost),"isoform"])
allvariants$mut_type<-unlist(h2.covariates[!is.na(h2.covariates$cardiacboost),"mut_type"])
a<-merge(d,allvariants,all = T)
a[is.na(a$total_carriers),"total_carriers"] <- 0
a[is.na(a$lqt2),"lqt2"] <- 0
d<-a
# set initial weighting and penetrance
d$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d[d$total_carriers < 1,"weight"] <- 0.000 # This is changed to "< 2" here to evaluate ROC-AUC of n=1 variants from the literatureUse observed LQT2 diagnosis probability to calculate “LQTS probability density” as described in previous publication. Plot diagnosis probability density versus residue
# Mean squared error
mse <- function(sm) {
mean((sm$residuals)^2*(sm$weights))
}
# Derive alpha and beta from weighted mean and MSE (estimated variance)
estBetaParams <- function(mu, var) {
alpha <- ((1 - mu) / var - 1 / mu) * mu ^ 2
beta <- alpha * (1 / mu - 1)
return(params = list(alpha = alpha, beta = beta))
}
# Weighted mean to determine LQT2 penetrance empirical prior
newdata = data.frame(wt=1)
model <- lm(penetrance_lqt2 ~ 1, data=d, weights = d$weight)
summary(model)##
## Call:
## lm(formula = penetrance_lqt2 ~ 1, data = d, weights = d$weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.36142 -0.25639 -0.03599 0.06351 0.61328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.36170 0.01309 27.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2574 on 950 degrees of freedom
## (6240 observations deleted due to missingness)
p<-predict(model, newdata)
dev<- mse(model) #p*(1-p)
# Estimated shape parameters for LQT2 empirical prior
alpha0 = estBetaParams(p,dev)$alpha
beta0 = estBetaParams(p,dev)$beta
print(paste("alpha0 = ", alpha0, " beta0 = ", beta0))## [1] "alpha0 = 0.899678647955565 beta0 = 1.58771266711751"
# Bayesian LQT2 penetrance estimates from empirical priors
# and observed affected/unaffected counts:
d$lqt2_penetranceBayesian_initial <- (alpha0 + d[,"lqt2"])/((alpha0 + beta0 + d[,"total_carriers"]))
d$lqt2_penetranceBayesian<-d$lqt2_penetranceBayesian_initialWith the updated empirical priors applied to carrier counts, calculate “LQTS probability density” as described in previous publication. !!! NOTE: since these data are truly the “best estimates” we include all variants in the calculation such that unique scores are by residue not by variant.
h2.covariates[, "lqt2_dist"]<-NA
h2.covariates[, "lqt2_dist_weight"]<-NA
h2.covariates<-h2.covariates[h2.covariates$isoform=="A",]
ld<-0
for(rec in seq(2,1159,1)){
#print(rec)
ld <- funcdist(rec, "var", d[!is.na(d$total_carriers) & d$total_carriers>0 & d$isoform == "A" & d$mut_type != "nonsense",], h2dist, "penetrance_lqt2", "sigmoid", 7)
h2.covariates[!is.na(h2.covariates$isoform) & !is.na(h2.covariates$resnum) & h2.covariates$isoform=="A" & h2.covariates$resnum == rec & !is.na(h2.covariates$var) & !is.na(h2.covariates$mut_type) & h2.covariates$mut_type == "missense","lqt2_dist"] <- ld[1]
h2.covariates[!is.na(h2.covariates$isoform) & !is.na(h2.covariates$resnum) & h2.covariates$isoform=="A" & h2.covariates$resnum == rec & !is.na(h2.covariates$var)& !is.na(h2.covariates$mut_type) & h2.covariates$mut_type == "missense","lqt2_dist_weight"] <-ld[2]
}
# Plot lqt2_dist versus residue number
h2.covariates$resnum<-as.integer(h2.covariates$resnum)
h2.covariates <- h2.covariates[order(h2.covariates$resnum),]
h2.covariates <- h2.covariates[!is.na(h2.covariates$resnum),]
# Merge "d" with full variant list and set carrier counts to 0.
# This is done for convenience so we can estimate LQT2 penetrance for all variants including
# those witheld during model construction. This will make validation easier.
d <- merge(d, h2.covariates, all = TRUE)
d <- unique(d)
# annotate structural location (hotspot)
d$Structure<-NA
d[!is.na(d$lqt2_dist) & d$lqt2_dist<0.1,"Structure"]<-"Non_Hotspot"
d[!is.na(d$lqt2_dist) & d$lqt2_dist>=0.1 & d$lqt2_dist<0.4,"Structure"]<-"Mild_Hotspot"
d[!is.na(d$lqt2_dist) & d$lqt2_dist>=0.4,"Structure"]<-"Hotspot"
# annotate functional perturbation
d$Function<-NA
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak<0.25,"Function"]<-"Severe Dominant LOF"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak<0.5 & d$ht_tailPeak>=0.25,"Function"]<-"Dominant LOF"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak>=0.5 & d$ht_tailPeak<0.75,"Function"]<-"LOF"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak>=0.75 & d$ht_tailPeak<1.25,"Function"]<-"Normal"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak>=1.25,"Function"]<-"GOF"Use an EM algorithm to estimate LQT2 PPV for each variant (or LQT2 diagnosis probability)
load("prepared_data.RData")
load("lit_all_data_checkpoint.RData")
cohort.data<-cohort.data[!is.na(cohort.data$resnum) & cohort.data$resnum<2000,]
cohort.data$weight = 1-1/(0.01+cohort.data$total_carriers)
cohort.data$penetrance_lqt2 <- cohort.data$lqt2/cohort.data$total_carriers
for (variant in cohort.data$var) {
if (!is.na(match(variant, d$var))) {
cohort.data[cohort.data$var == variant, c("p_mean_w","alpha", "beta", "lqt2_lit", "unaff_lit", "total_carriers_lit", "lqt2_dist")] <- d[match(variant, d$var), c("p_mean_w", "alpha", "beta", "lqt2", "unaff", "total_carriers", "lqt2_dist")]
}
}
m<- cohort.data %>%
select(resnum, pmean = penetrance_lqt2) %>%
mutate(p_mean_smooth = rollmean(pmean, k=20, fill = NA))
fit <- loess(cohort.data[,"penetrance_lqt2"]~as.numeric(cohort.data[,"resnum"]), span = 0.15)
plot(cohort.data$resnum, cohort.data$penetrance_lqt2, xlab ="Residue", ylab = "LQT2 diagnosis probability", col = "red", pch=19)
xrange <- seq(min(fit$x), max(fit$x), length.out = 100)
ps <- predict(fit, xrange, se=T)
lines(xrange, ps$fit*1, lwd=5)
lines(xrange, (ps$fit+1.96*ps$se.fit)*1, lty=2, lwd=4)
lines(xrange, (ps$fit-1.96*ps$se.fit)*1, lty=2, lwd=4)# Add covariates to cohort dataset
cohort.data <- merge(cohort.data, h2.covariates, all = TRUE)
cohort.data <- unique(cohort.data)
# Save cohort data
save(cohort.data, file = "cohort_checkpoint.RData")Use the observed diagnosis probability from as the TRUE diagnosis probability, generate n binomial observations
Use the final EM algorithm posterior as the prior for Beta-Binomial, incorporate data from step (1), generate the posterior distribution, and get 95% credible interval.
Check whether the interval cover the true diagnosis probability from Step 1.
Repeat Step 1 to Step 3 N times to get the coverage rate.
BootsCoverage <- function(var,n=100,N=1000,true){
# var: variant name
# n: number of subjects in the new data
# N: number of Bootstrap
# extract the "true" diagnosis probability
true.p <- d[d$var==var,true]
# generate binomial data
event <- rbinom(N,n,true.p)
# get the posterior credible interval
alpha <- d$alpha[which(d$var==var)]
beta <- d$beta[which(d$var==var)]
new.alpha <- alpha + event
new.beta <- beta + n - event
lb <- qbeta(0.025,new.alpha,new.beta)
ub <- qbeta(0.975,new.alpha,new.beta)
# change lb to floor of nearest 0.1
lb <- floor(lb*20)/20
ub <- ceiling(ub*20)/20
return(sum(lb < true.p & ub > true.p)/N)
}The coverage plot where observed diagnosis probability is the “true” diagnosis probability and one hundred new observations are added is shown below.
# load data, literature + gnomAD + cohort
load("lit_all_data_checkpoint.RData")
load("cohort_checkpoint.RData")
calcPval=function(xName,yName,weightName,nPerms,new.mat2){
# Pulls out variables
x=new.mat2[,xName]
y=new.mat2[,yName]
w=new.mat2[,weightName]
x2=x[!is.na(x)]
y2=y[!is.na(x)]
w2=w[!is.na(x)]
# Calculate the real correlation
realCorr=weightedCorr(x2,y2,method='spearman',weights=w2)
# Do permutations, calculate fake correlations
permutedCorrList=c()
for(permNum in 1:nPerms){
permutedX=sample(x2,length(x2),replace=FALSE)
wCorrSim=weightedCorr(permutedX,y2,method='spearman',weights=w2)
permutedCorrList=c(permutedCorrList,wCorrSim)
}
permutedCorrList2=abs(permutedCorrList)
realCorr2=abs(realCorr)
# Calculate pvalue
summ=sum(realCorr2<permutedCorrList2)
pValue=summ/nPerms
return(list(realCorr,pValue,length(x2)))
}
calcAllPvals=function(yList,xList,nPerms,weightName,new.mat2){
i=0
resultTable=data.frame()
for(yName in yList){
for(xName in xList){
i=i+1
result=calcPval(xName,yName,weightName,nPerms,new.mat2)
resultTable[i,'x']=xName
resultTable[i,'y']=yName
resultTable[i,'nPerms']=nPerms
resultTable[i,'weightedCorr']=result[[1]]
resultTable[i,'pValue']=result[[2]]
resultTable[i,'n']=result[[3]]
#print(resultTable[i,'pValue'])
}
}
print(resultTable)
return(resultTable)
}
# Select covariates from isoform "A" in cohort dataset
cohort.data <- cohort.data[!is.na(cohort.data$total_carriers) & cohort.data$isoform == "A" & cohort.data$mut_type == "missense",]
cohort.data <- cohort.data[,!names(cohort.data) %in% "cardiacboost"]
cohort.data <- unique(cohort.data)
tmp<-cohort.data[!is.na(cohort.data$provean_score) & !is.na(cohort.data$revel_score) & !is.na(cohort.data$pamscore),]
yList=c('penetrance_lqt2')
xList=c('hm_ssPeak','hm_tailPeak','hm_vhalfact','hm_vhalfinact','hm_recovfrominact', 'hm_taudeact_fast',
'ht_ssPeak','ht_tailPeak','ht_vhalfact','ht_vhalfinact','ht_recovfrominact', 'ht_taudeact_fast',
'pph2_prob', 'provean_score', "blast_pssm",
'pamscore', 'aasimilaritymat', "lqt2_dist", "revel_score", 'p_mean_w')
resultTable<-calcAllPvals(yList, xList, 1000, 'weight', tmp)## x y nPerms weightedCorr pValue n
## 1 hm_ssPeak penetrance_lqt2 1000 -0.13782555 0.755 8
## 2 hm_tailPeak penetrance_lqt2 1000 -0.45322142 0.011 46
## 3 hm_vhalfact penetrance_lqt2 1000 0.17406867 0.431 34
## 4 hm_vhalfinact penetrance_lqt2 1000 0.58824513 0.104 13
## 5 hm_recovfrominact penetrance_lqt2 1000 -0.01793348 0.978 8
## 6 hm_taudeact_fast penetrance_lqt2 1000 0.09551958 0.749 26
## 7 ht_ssPeak penetrance_lqt2 1000 -0.72712827 0.033 10
## 8 ht_tailPeak penetrance_lqt2 1000 -0.74007175 0.000 38
## 9 ht_vhalfact penetrance_lqt2 1000 -0.06920233 0.774 21
## 10 ht_vhalfinact penetrance_lqt2 1000 -0.42544340 0.180 13
## 11 ht_recovfrominact penetrance_lqt2 1000 0.31291231 0.511 4
## 12 ht_taudeact_fast penetrance_lqt2 1000 0.40856426 0.177 13
## 13 pph2_prob penetrance_lqt2 1000 0.29946329 0.000 255
## 14 provean_score penetrance_lqt2 1000 -0.40683895 0.000 255
## 15 blast_pssm penetrance_lqt2 1000 -0.08826634 0.232 255
## 16 pamscore penetrance_lqt2 1000 -0.03980324 0.612 255
## 17 aasimilaritymat penetrance_lqt2 1000 0.09491418 0.207 255
## 18 lqt2_dist penetrance_lqt2 1000 0.48764962 0.000 255
## 19 revel_score penetrance_lqt2 1000 0.46266065 0.000 255
## 20 p_mean_w penetrance_lqt2 1000 0.53708337 0.000 255
rm(tmp)
rm(t)
i=0
tmp<-data.frame()
for (x in xList){
i=i+2
tmp[i-1,"Feature"]<-x
t<-d[!is.na(d[,x]) & d$total_carriers>0,]
t<-t[!is.na(t[,"var"]),]
tmp[i,"Feature"]<-paste(x,"_cohort")
t<-d[!is.na(d[,x]) & d$total_carriers>0,]
t<-t[!is.na(t[,"var"]),]
foo <- boot(t, function(data,indices)
weightedCorr(t[indices,x],t$penetrance_lqt2[indices], method="spearman", weights = t$weight[indices]), R=1000)
tmp[i-1,"Spearman"]<-foo$t0
tmp[i-1,"Spearman_low"]<-quantile(foo$t,c(0.025,0.975), na.rm = T)[1][[1]]
tmp[i-1,"Spearman_high"]<-quantile(foo$t,c(0.025,0.975), na.rm = T)[2][[1]]
tmp[i-1,"n"]<-length(t[,x])
t<-cohort.data[!is.na(cohort.data[,x]) & cohort.data$total_carriers>0,]
t<-t[!is.na(t[,"var"]),]
foo <- boot(t, function(data,indices)
weightedCorr(t[indices,x],t$penetrance_lqt2[indices], method="spearman", weights = t$weight[indices]), R=1000)
tmp[i,"Spearman"]<-foo$t0
tmp[i,"Spearman_low"]<-quantile(foo$t,c(0.025,0.975), na.rm = T)[1][[1]]
tmp[i,"Spearman_high"]<-quantile(foo$t,c(0.025,0.975), na.rm = T)[2][[1]]
tmp[i,"n"]<-length(t[,x])
}
forestplot(tmp$Feature,tmp$Spearman,tmp$Spearman_low,tmp$Spearman_high)# reset "tmp" variable
tmp<-cohort.data[!is.na(cohort.data$provean_score) & !is.na(cohort.data$revel_score) & !is.na(cohort.data$pamscore),]
# Weighted R2 between observed LQT2 penetrance and post-test probability
foo <- boot(tmp, function(data,indices)
weightedCorr(tmp$p_mean_w[indices],tmp$penetrance_lqt2[indices], method="pearson", weights = tmp$weight[indices])^2, R=1000)
print("EM estimated LQT2 diagnosis probability versus observed cohort LQT2 diagnosis probability")## [1] "EM estimated LQT2 diagnosis probability versus observed cohort LQT2 diagnosis probability"
foo$t0## [1] 0.3281077
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.2155762 0.4505238
model <- lm(penetrance_lqt2~lqt2_dist, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("LQT2 diagnosis probability density versus observed cohort LQT2 diagnosis probability")## [1] "LQT2 diagnosis probability density versus observed cohort LQT2 diagnosis probability"
foo$t0## [1] 0.2513792
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.1458213 0.3669153
model <- lm(penetrance_lqt2~revel_score, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("REVEL score versus observed cohort LQT2 diagnosis probability")## [1] "REVEL score versus observed cohort LQT2 diagnosis probability"
foo$t0## [1] 0.2316343
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.1338631 0.3560617
# Evaluate only variants with Heterozygous peak tail current measured.
tmp<-cohort.data[!is.na(cohort.data$ht_tailPeak),]
resultTable<-calcAllPvals(yList, xList, 1000, 'weight', tmp)## x y nPerms weightedCorr pValue n
## 1 hm_ssPeak penetrance_lqt2 1000 0.972864918 0.300 4
## 2 hm_tailPeak penetrance_lqt2 1000 -0.251150899 0.258 28
## 3 hm_vhalfact penetrance_lqt2 1000 -0.005613182 0.988 14
## 4 hm_vhalfinact penetrance_lqt2 1000 -0.587879495 0.300 6
## 5 hm_recovfrominact penetrance_lqt2 1000 -1.000000000 0.000 2
## 6 hm_taudeact_fast penetrance_lqt2 1000 0.154267674 0.797 6
## 7 ht_ssPeak penetrance_lqt2 1000 -0.752439389 0.022 11
## 8 ht_tailPeak penetrance_lqt2 1000 -0.750024456 0.000 39
## 9 ht_vhalfact penetrance_lqt2 1000 0.107075664 0.690 20
## 10 ht_vhalfinact penetrance_lqt2 1000 -0.499958288 0.088 14
## 11 ht_recovfrominact penetrance_lqt2 1000 0.312912311 0.494 4
## 12 ht_taudeact_fast penetrance_lqt2 1000 0.558320083 0.085 11
## 13 pph2_prob penetrance_lqt2 1000 0.366819707 0.040 38
## 14 provean_score penetrance_lqt2 1000 -0.292846391 0.125 38
## 15 blast_pssm penetrance_lqt2 1000 0.172315938 0.356 38
## 16 pamscore penetrance_lqt2 1000 -0.002273073 0.987 39
## 17 aasimilaritymat penetrance_lqt2 1000 0.247567799 0.171 39
## 18 lqt2_dist penetrance_lqt2 1000 0.666753766 0.000 39
## 19 revel_score penetrance_lqt2 1000 0.709843622 0.000 39
## 20 p_mean_w penetrance_lqt2 1000 0.763687875 0.000 39
foo <- boot(tmp, function(data,indices)
weightedCorr(tmp$p_mean_w[indices],tmp$penetrance_lqt2[indices], method="pearson", weights = tmp$weight[indices])^2, R=1000)
print("EM estimated LQT2 diagnosis probability versus observed cohort LQT2 diagnosis probability")## [1] "EM estimated LQT2 diagnosis probability versus observed cohort LQT2 diagnosis probability"
foo$t0## [1] 0.5640682
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.2843941 0.7730929
model <- lm(penetrance_lqt2~ht_tailPeak, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("Heterozygously measured peak tail current versus observed cohort LQT2 diagnosis probability")## [1] "Heterozygously measured peak tail current versus observed cohort LQT2 diagnosis probability"
foo$t0## [1] 0.4953738
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.2951170 0.6951972
model <- lm(penetrance_lqt2~lqt2_dist, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("LQT2 probability density versus observed cohort LQT2 diagnosis probability")## [1] "LQT2 probability density versus observed cohort LQT2 diagnosis probability"
foo$t0## [1] 0.4031363
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.1663152 0.6468372
model <- lm(penetrance_lqt2~revel_score, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("REVEL score versus observed cohort LQT2 diagnosis probability")## [1] "REVEL score versus observed cohort LQT2 diagnosis probability"
foo$t0## [1] 0.4568439
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.2465301 0.7131307
load("lit_all_data_checkpoint.RData")
tmp<-d[!is.na(d$provean_score) & !is.na(d$penetrance_lqt2) & !is.na(d$lqt2_dist) & !is.na(d$revel_score),]
foo <- boot(tmp, function(data,indices)
weightedCorr(tmp$p_mean_w[indices],tmp$penetrance_lqt2[indices], method="pearson", weights = tmp$weight[indices])^2, R=1000)
print("EM estimated LQT2 diagnosis probability versus observed literature LQT2 diagnosis probability")## [1] "EM estimated LQT2 diagnosis probability versus observed literature LQT2 diagnosis probability"
foo$t0## [1] 0.8168311
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.7677140 0.8596818
model <- lm(penetrance_lqt2~lqt2_dist, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("LQT2 probability density versus observed literature LQT2 diagnosis probability")## [1] "LQT2 probability density versus observed literature LQT2 diagnosis probability"
foo$t0## [1] 0.4902899
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.3887100 0.5914084
model <- lm(penetrance_lqt2~revel_score, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("REVEL versus observed literature LQT2 diagnosis probability")## [1] "REVEL versus observed literature LQT2 diagnosis probability"
foo$t0## [1] 0.3920345
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.3249512 0.4562831
# Evaluate only variants with Heterozygous peak tail current measured.
tmp<-d[!is.na(d$ht_tailPeak) & !is.na(d$provean_score) & !is.na(d$penetrance_lqt2) & !is.nan(d$penetrance_lqt2) & !is.na(d$lqt2_dist),]
foo <- boot(tmp, function(data,indices)
weightedCorr(tmp$p_mean_w[indices],tmp$penetrance_lqt2[indices], method="pearson", weights = tmp$weight[indices])^2, R=1000)
print("EM estimated LQT2 diagnosis probability versus observed literature LQT2 diagnosis probability")## [1] "EM estimated LQT2 diagnosis probability versus observed literature LQT2 diagnosis probability"
foo$t0## [1] 0.8884971
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.8053722 0.9438262
model <- lm(penetrance_lqt2~lqt2_dist, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("LQT2 probability density versus observed literature LQT2 diagnosis probability")## [1] "LQT2 probability density versus observed literature LQT2 diagnosis probability"
foo$t0## [1] 0.5945224
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.4220788 0.7419627
model <- lm(penetrance_lqt2~ht_tailPeak, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("Heterozygous peak tail current versus observed literature LQT2 diagnosis probability")## [1] "Heterozygous peak tail current versus observed literature LQT2 diagnosis probability"
foo$t0## [1] 0.3390998
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.1814097 0.5573245
model <- lm(penetrance_lqt2~revel_score, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("REVEL versus observed literature LQT2 diagnosis probability")## [1] "REVEL versus observed literature LQT2 diagnosis probability"
foo$t0## [1] 0.4286368
quantile(foo$t,c(0.025,0.975))## 2.5% 97.5%
## 0.2418773 0.6251751
AUCs from ROCs predicting EM posteriors
## [1] "263 218"
## [1] "263 216"
## [1] "263 214"
## [1] "263 212"
## [1] "263 205"
## [1] "263 193"
## [1] "263 193"
## [1] "263 190"
## [1] "263 190"
## [1] "263 160"
## [1] "263 158"
## [1] "263 153"
## [1] "263 135"
## [1] "263 135"
## [1] "263 127"
## Area under the curve: 0.6648
## Area under the curve: 0.5209
## Area under the curve: 0.7175
## Area under the curve: 0.5836
## Area under the curve: 0.8093
## Area under the curve: 0.8228
## [1] "741 234"
## [1] "741 231"
## [1] "741 229"
## [1] "741 223"
## [1] "741 218"
## [1] "741 211"
## [1] "741 208"
## [1] "741 205"
## [1] "741 205"
## [1] "741 193"
## [1] "741 192"
## [1] "741 190"
## [1] "741 183"
## [1] "741 179"
## [1] "741 174"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
The code to produce the forest plots is shown in the code chunk.
rm(d)
# Loading combined literature, gnomAD, and cohort dataset
load("lit_all_data_checkpoint.RData")
d<-d[d$total_carriers>0 & d$mut_type!="nonsense" & d$isoform=="A",]
d<-d[!is.na(d$var),]
mean.post <- (d$alpha + d$lqt2)/(d$alpha+d$beta+d$total_carriers)
mean.prior <- (d$alpha)/(d$alpha+d$beta)
lower.prior <- qbeta(0.025,d$alpha,d$beta)
higher.prior <- qbeta(0.975,d$alpha,d$beta)
lower.post <- qbeta(0.025,d$alpha+d$lqt2,d$beta+d$total_carriers-d$lqt2)
higher.post <- qbeta(0.975,d$alpha+d$lqt2,d$beta+d$total_carriers-d$lqt2)
forest.data.post <- data.frame(variant = d$var, mean=mean.post,
lower=lower.post, higher=higher.post, resnum=d$resnum, tc=d$total_carriers, lqt2=d$lqt2)
forest.data.post$group<-"posterior"
forest.data.prior <- data.frame(variant = d$var, mean=mean.prior,
lower=lower.prior, higher=higher.prior, resnum=d$resnum, tc=d$total_carriers, lqt2=d$lqt2)
forest.data.prior$group<-"prior"
forest.data<-rbind(forest.data.post, forest.data.prior)
forest.data<-forest.data[order(forest.data$resnum, forest.data$variant),]
forest.data$label<-""
forest.data[forest.data$group=="posterior","label"]<-paste(forest.data[forest.data$group=="posterior","lqt2"], "/", forest.data[forest.data$group=="posterior","tc"])
#define colours for dots and bars
dotCOLS = c("#866D4B","#000000")
barCOLS = c("#FFFFFF","#FFFFFF")
plotg <- function(a,b){
fd<-forest.data[a:b,]
png( paste("Plots/", a, "-",b,"pics.png",sep=""),res=300,height=10,width=10,units="in")
p<-ggplot(fd, aes(x=reorder(variant,-resnum), y=mean, ymin=lower, ymax=higher, col=group, fill=group)) +
geom_text(data=fd, aes(x=reorder(variant,-resnum), label=label)) +
#specify position here
geom_linerange(size=2,position=position_dodge(width = 1)) +
geom_hline(yintercept=1, lty=1) +
geom_hline(yintercept=0, lty=1) +
#specify position here too
geom_point(size=2, shape=21, colour="white", stroke = 0.5,position=position_dodge(width = 1)) +
scale_fill_manual(values=barCOLS)+
scale_color_manual(values=dotCOLS)+
scale_y_continuous(name="LQT2 Diagnosis Probability", limits = c(0, 1)) +
coord_flip() +
theme_minimal()
print(p)
dev.off()
}
sapply(0:30*50+1,function(x) plotg(x,x+49) )The forests plots are pasted below, where gold bars are posteriors, and black bars are EM priors. Variant name is given to the left and the number of LQT2 affected / total heterozygote count is given above the dot indicating the posterior mean.